Author: Nitish Bhardwaj
Date Created: 2nd May 2020
Last Updated: 14th May 2020
#data manipulation imports
import pandas as pd
import numpy as np
#data scraping imports
from bs4 import BeautifulSoup
import requests
#geocoding imports
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
#visualization libraries
import folium as folium
from folium.plugins import MarkerCluster
import branca #A spinoff for Folium to support HTML+JS
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import plotly.graph_objects as go
#transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize
# import sklearn packages
from sklearn.cluster import KMeans
from sklearn.preprocessing import power_transform, StandardScaler
#import stats to verify the skewness in the data
from scipy import stats
#metric imports
from sklearn.metrics import silhouette_score
#to supress warnings
import warnings
warnings.filterwarnings('ignore')
#Formatting the print statements
#Formatting the print statements
class style:
    """ANSI escape sequences used to emphasise console output."""
    # foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # text effects
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # reset the terminal back to its default style
    END = '\033[0m'
#This data is fetched from https://open.toronto.ca/dataset/neighbourhoods/
df_geoDetails = pd.read_csv("Neighbourhoods.csv")
df_geoDetails.head()
# keep only the columns needed downstream
df_geoDetails = df_geoDetails[['AREA_NAME', 'AREA_SHORT_CODE', 'LONGITUDE', 'LATITUDE']]
# strip the trailing parenthesised id, e.g. "Casa Loma (96)" -> "Casa Loma".
# regex=True is required: pandas >= 2.0 treats str.replace patterns as
# literal strings by default, which would leave the suffix in place.
df_geoDetails['AREA_NAME'] = df_geoDetails['AREA_NAME'].str.replace(r"\s*\([^()]*\)", "", regex=True).str.strip()
df_geoDetails.rename(columns={"AREA_NAME": "Neighbourhood", "AREA_SHORT_CODE": "Neighbourhood_Id"}, inplace=True)
df_geoDetails.sort_values(by=['Neighbourhood_Id']).head()
#This data is fetched from https://open.toronto.ca/dataset/wellbeing-toronto-economics/
df_economics = pd.read_excel("wellbeing-toronto-economics.xlsx")
df_economics.rename(columns={"Neighbourhood Id": "Neighbourhood_Id"}, inplace=True)
df_economics.head()
# retain only the economic indicators of interest plus the join key
df_economics = df_economics[['Neighbourhood_Id', 'Home Prices', 'Child Care Spaces', 'Local Employment']]
# cast the key to string on both sides so the merge keys line up
df = pd.merge(
    df_geoDetails.assign(Neighbourhood_Id=df_geoDetails.Neighbourhood_Id.astype(str)),
    df_economics.assign(Neighbourhood_Id=df_economics.Neighbourhood_Id.astype(str)),
    on='Neighbourhood_Id',
)
df.head()
#This data is fetched from https://open.toronto.ca/dataset/wellbeing-toronto-safety/
df_crime = pd.read_excel("wellbeing-toronto-safety.xlsx")
df_crime.rename(columns={"Neighbourhood Id": "Neighbourhood_Id"}, inplace=True)
df_crime.head()
# drop the duplicate name column; df already carries Neighbourhood
df_crime.drop(columns=['Neighbourhood'], inplace=True)
# merge on the string-typed id, as with the economics data
df = pd.merge(
    df.assign(Neighbourhood_Id=df.Neighbourhood_Id.astype(str)),
    df_crime.assign(Neighbourhood_Id=df_crime.Neighbourhood_Id.astype(str)),
    on='Neighbourhood_Id',
)
df.head()
df.shape
df.info()
Observation:
Neighbourhood and Neighbourhood_Id are object types. Let's keep a note of it. Based on the requirement, we will change the data type.
# count the missing values per column
print(f"{style.BOLD}{style.UNDERLINE}Verify the number of missing values in the dataset:{style.END}")
df.isnull().sum()
Observation:
No null values or missing values are found in the dataset.
df.describe()
#Total incident counts per major-crime column, largest first
crime_cols = list(df.columns[8:-2])
crime_totals = df[crime_cols].sum().sort_values(ascending=False)
#Visualization for major crimes in Toronto
bar = go.Bar(
    x=crime_totals.index,
    y=crime_totals.values,
    text=crime_totals.values,
    textposition='outside',
    hovertemplate="%{x}: %{y} </br>",
    name='',
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure(bar)
fig.update_layout(
    title={'text': '<b>Crime frequency in Toronto</b>',
           'x': 0.5, 'xanchor': 'center', 'font': dict(size=20, color='black')},
    xaxis_tickfont_size=14,
)
# black outline around the bars
fig.update_traces(marker=dict(line=dict(color='#000000', width=2.5)))
fig.update_xaxes(title_text="Crime Type", titlefont_size=17, tickfont_size=14)
fig.update_yaxes(title_text="No. of Incidents", titlefont_size=17, tickfont_size=14)
fig.show()
Observation:
There is a high number of Assault, Break & Enter, Drug Arrest, Robbery, and Vehicle Theft cases in the city of Toronto. This makes it essential to include these cases among our clustering features, as they have a high impact when deciding on the right neighbourhood.
For this business case, we will be removing Fire related crime cases from the dataset to make sure that clustering algorithms gives more consideration to cases like Assaults, Break & Enters etc.
# Remove fire-related columns plus the aggregate and murder counts so the
# clustering features concentrate on cases like Assaults and Break & Enters
cols_to_drop = ['Arsons', 'Fire Medical Calls', 'Fire Vehicle Incidents',
                'Fires & Fire Alarms', 'Total Major Crime Incidents', 'Murders']
df.drop(columns=cols_to_drop, inplace=True)
# geocode the city centre to anchor the maps
address = 'Toronto, Ontario, Canada'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario, Canada are {}, {}.'.format(latitude, longitude))
#Initialize map
intializeMap = folium.Figure(height=400)
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(intializeMap)
#Initialize the cluster
mc = folium.plugins.MarkerCluster()
# add one circle marker per neighbourhood to the cluster
for lat, lng, neighbourhood in zip(df['LATITUDE'], df['LONGITUDE'],
                                   df['Neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    mc.add_child(folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False))
# attach the cluster to the map once, after all markers are added
# (the original chained .add_to(map_toronto) inside the loop, re-adding
# the cluster on every iteration)
mc.add_to(map_toronto)
map_toronto
# Histogram of home prices across neighbourhoods
plt.hist(x=df['Home Prices'], density=False, bins=10)
plt.title(label="Distribution of Home prices")
plt.ylabel('Number of Neighbourhoods')
_ = plt.xlabel('Home price range')
print(style.BOLD + style.UNDERLINE + "Observation:\n" + style.END)
# neighbourhoods priced between $300K and $600K (exclusive)
mid_range = df[(df['Home Prices'] > 300000) & (df['Home Prices'] < 600000)]
print("1. There are",
      len(mid_range['Home Prices']),
      "neighbouroods out of",
      df.shape[0],
      "that have a home price range between $300K to $600K.")
print("\n2. Home price data is right skewed.")
df.columns
# Histograms of every numeric feature (column 4 onward) of the raw data
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
df.iloc[:, 4:].hist(ax=ax)
plt.tight_layout()
print(f"{style.BOLD}{style.UNDERLINE}Raw Data:\n{style.END}")
plt.show()
Observation:
In this section, we will use statistical methods to view the skewness in the dataset and later, attempt to reduce this skewness and see if the data can be transformed to have a normal distribution.
#Skew score per numeric feature, highest first.
#Latitude and Longitude are sliced (i.e. [:-2]) from the result.
#numeric_only=True keeps skew() away from the object-typed Neighbourhood
#columns, which newer pandas would otherwise raise on.
df.skew(numeric_only=True).sort_values(ascending=False)[:-2]
Observation:
Most of the variables have positive skew values. By comparing the skewness result and histograms of the variables it is confirmed that skewness is present in few variables.
As the data have a range of values and not all values are positive, we will be using sklearn's power transform with 'yeo-johnson' method to transform the data.
# Transform the numeric features with Yeo-Johnson, which unlike Box-Cox
# also accepts zero and negative values
feature_cols = list(df.columns[4:])
transformed_values = power_transform(df[feature_cols], method='yeo-johnson')
# power_transform returns a numpy array; wrap it back into a dataframe
df_transformed = pd.DataFrame(transformed_values, columns=feature_cols)
df_transformed
# Histograms of the transformed features (all but the last two columns)
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
df_transformed.iloc[:, :-2].hist(ax=ax)
plt.tight_layout()
print(f"{style.BOLD}{style.UNDERLINE}Data post application of yeo-johnson method:\n{style.END}")
plt.show()
Observation:
Home prices have all the values as positive. Let's apply Box cox and see if it can help us get a normal distribution.
#Transforming the Home Prices for normal distribution using boxcox
#(Box-Cox requires strictly positive data, which home prices are)
new_homePrice, _bc_lambda = stats.boxcox(df['Home Prices'])
print("Old Skewscore: ", df['Home Prices'].skew(),
      "\nSkewscore post Boxcox:", pd.Series(new_homePrice).skew())
#Result after boxcox
plt.hist(x=new_homePrice, density=False, bins=10)
plt.title(label="Distribution of Home prices")
plt.ylabel('Number of Neighbourhoods')
plt.xlabel('Home price range')
Observation:
Home price data now exhibits a fairly normal distribution.
#Swap the yeo-johnson Home Prices column for the Box-Cox transformed one,
#then put the neighbourhood label back as the first column
df_transformed.drop(columns=['Home Prices'], inplace=True)
df_transformed.insert(0, 'Home Prices', new_homePrice)
df_transformed.insert(0, 'Neighbourhood', df['Neighbourhood'])
print(f"{style.BOLD}{style.UNDERLINE}Transformed Data:\n{style.END}")
df_transformed.head()
# Foursquare API credentials -- fill these in before running
CLIENT_ID = '<Enter your Client ID>' # your Foursquare ID
CLIENT_SECRET = '<Enter your Client Secret>' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentails:')
print(f'CLIENT_ID: {CLIENT_ID}')
print(f'CLIENT_SECRET:{CLIENT_SECRET}')
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    '''
    Fetch the venues near each location using the Foursquare "explore" API.
    names: iterable of location names
    latitudes: iterable of location latitudes
    longitudes: iterable of location longitudes
    radius: search radius in metres (default 500)
    LIMIT: maximum number of venues returned per location (default 100)
    Return value: a dataframe with one row per venue, holding the
    neighbourhood, its coordinates, and the venue name/coordinates/category.
    '''
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and URL-encode the query string instead of
        # formatting the URL by hand (hand-built values were not encoded)
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # make the GET request
        results = requests.get('https://api.foursquare.com/v2/venues/explore',
                               params=params).json()["response"]['groups'][0]['items']
        # keep only the relevant fields for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    # flatten the per-location lists into a single dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood',
                             'Neighbourhood Latitude',
                             'Neighbourhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
# fetch the nearby venues for every neighbourhood
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                 latitudes=df['LATITUDE'],
                                 longitudes=df['LONGITUDE'])
print(toronto_venues.shape)
toronto_venues.head()
# number of venues fetched per neighbourhood
venue_counts = toronto_venues['Neighbourhood'].value_counts().to_frame()
venue_counts.columns = ['No. of Venues per Neighbourhood']
venue_counts
print(f"There are {len(toronto_venues['Venue Category'].unique())} uniques categories.")
In the below steps, one-hot encoding is performed on "Venue Category" column of the dataframe.
# one hot encoding of the venue categories
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']
# reorder so the neighbourhood label comes first
reordered = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[reordered]
toronto_onehot.head()
print("Shape of one-hot encoded dataframe: ", toronto_onehot.shape)
Group the one-hot encoded dataframe by the "Neighbourhood" column and compute the mean. This also gives a rough idea of the frequency of occurrence of each category.
# Per-neighbourhood mean of the one-hot columns = frequency of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.shape
Observation:
After grouping the data by neighbourhood, we should have obtained 140 rows — one per neighbourhood. The 137 rows above show that Foursquare has no data for three of the neighbourhoods present in the original data.
In the next steps, we will find the neighbourhoods for which foursquare did not have any data and fix the missing values.
# merge the grouped venue data back onto the original dataframe
toronto_grouped = df.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')
# show the neighbourhoods for which the Foursquare API returned nothing
toronto_grouped[toronto_grouped.isnull().any(axis=1)]
Observation:
The three neighbourhoods for which no values were fetched from the FourSquare API are shown above. We are going to fill these values with the mean of each column.
# Fill the missing venue frequencies with each column's mean.
# numeric_only=True keeps mean() away from the non-numeric columns
# (e.g. Neighbourhood), which newer pandas would otherwise raise on;
# the filled values are unchanged.
toronto_grouped.fillna(toronto_grouped.mean(numeric_only=True), inplace=True)
# verify: an empty result means no missing values remain
toronto_grouped[toronto_grouped.isna().any(axis=1)]
Observation:
0 rows signifies that no missing values are present in the dataframe now.
#Columns duplicated from the original data that are no longer needed
tempCol = list(df.columns[1:])
tempCol
#drop them, keeping Neighbourhood plus the venue-frequency columns
toronto_grouped.drop(columns=tempCol, inplace=True)
print("Shape of one-hot encoded dataframe grouped by the neighbourhood and its mean: ", toronto_grouped.shape)
Create a function to find the top venues by sorting them in descending order.
def return_most_common_venues(row, num_top_venues):
    '''
    Return the labels of the top venue categories for one neighbourhood row.
    row: a dataframe row whose first entry is the neighbourhood label and
         whose remaining entries are mean venue-category frequencies.
    num_top_venues: how many top categories to return.
    Return value: array of category names in descending frequency order.
    '''
    # skip the leading neighbourhood label, rank the rest high-to-low
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
Create column names for the dataframe i.e. top 10 venues and call the above function on each neighbourhood of Toronto
num_top_venues = 10
# ordinal suffixes: 1st, 2nd, 3rd, then 'th' for everything after
indicators = ['st', 'nd', 'rd']
# build the column headers for the top-N venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))
# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']
for ind in np.arange(toronto_grouped.shape[0]):
    # rank row `ind`'s venue columns and write the top 10 names back
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
print(f"{style.BOLD}{style.UNDERLINE}Neighbourhoods and their top 10 common venues:\n{style.END}")
neighbourhoods_venues_sorted.head()
# attach the top-10 venue columns to the full feature dataframe
df_Final = df.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
print(f"{style.BOLD}{style.UNDERLINE}The final dataframe having top 10 venues and all other details:\n{style.END}")
df_Final.head()
For K means, selection of K is an important factor. For this project, Elbow method and Silhouette score is used to decide the optimum value of k.
# merge one hot encoded, standardized venue data (toronto_grouped) with the
# crime and home-price transformed data (df_transformed)
toronto_grouped_clustering = df_transformed.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')
# drop the categorical Neighbourhood column; it holds no significance for clustering
toronto_grouped_clustering.drop(columns=['Neighbourhood'], inplace=True)
# collect inertia (elbow method) and silhouette score for each candidate k
distortions = []
silhouette = []
K = range(2, 10)
for k in K:
    # n_init pinned explicitly to the long-standing default of 10:
    # sklearn 1.4 changed the default to 'auto'
    kmeanModel = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
    kmeanModel.fit(toronto_grouped_clustering)
    # Elbow method
    distortions.append(kmeanModel.inertia_)
    # Silhouette score
    silhouette.append(silhouette_score(toronto_grouped_clustering, kmeanModel.labels_, metric='euclidean'))
# plot graphs of Elbow method and Silhouette score
# (the original also created an unused `silhouetteTemp` list — removed)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
axes[0].plot(K, distortions, marker='o', markersize=10)
axes[0].set(xlabel='K', ylabel='Distortion')
axes[0].set_title('Elbow Method showing the optimal k')
# silhouette shown only for k = 3..8 (first/last entries trimmed)
axes[1].plot(range(3, 9), silhouette[1:-1], marker='o', markersize=10)
axes[1].set(xlabel='K', ylabel='Silhouette score')
axes[1].set_title('Silhouette score showing the optimal k')
fig.suptitle('Finding optimum K value for K-means by Elbow method and Silhouette score', fontsize=16, y=1.05)
fig.subplots_adjust(wspace=0.5)
Hence, we will make 3 clusters in this process.
# set number of clusters
kclusters = 3
# drop the categorical Neighbourhood column before clustering;
# axis must be passed as a keyword — the bare positional form
# drop('Neighbourhood', 1) was removed in pandas 2.0.
# NOTE(review): this clusters on toronto_grouped (venue data only), not the
# joined df_transformed frame used above to pick k — confirm intentional.
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', axis=1)
# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_
# add clustering labels as the first column of the final dataframe
df_Final.insert(0, 'Cluster Labels', kmeans.labels_)
df_Final.head()
#Initialize map
intializeMap = folium.Figure(height=400)
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(intializeMap)
# drop rows that still contain NaN, then make the labels plain ints
df_Final.dropna(inplace=True)
df_Final = df_Final.astype({"Cluster Labels": int})
# one rainbow colour per cluster. The original built a throwaway
# `ys = [i + x + (i*x)**2 ...]` list solely to take its length (= kclusters);
# sampling the colormap directly is equivalent and clearer.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add one circle marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(df_Final['LATITUDE'],
                                  df_Final['LONGITUDE'],
                                  df_Final['Neighbourhood'],
                                  df_Final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 wraps cluster 0 around to the last colour; each cluster
    # still gets a distinct colour
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_toronto)
map_toronto
#Keep only the features required for cluster analysis
df_Final.drop(columns=['Neighbourhood_Id', 'LATITUDE', 'LONGITUDE'], inplace=True)
#set the pandas option to show all the columns in the dataframe
pd.set_option('display.max_columns', None)
df_Final[df_Final['Cluster Labels'] == 0]
df_Final[df_Final['Cluster Labels'] == 1]
#set the float format of pandas to display long values with 2 decimal places
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df_Final[df_Final['Cluster Labels'] == 1].describe()
df_Final[df_Final['Cluster Labels'] == 2].head(10)
df_Final[df_Final['Cluster Labels'] == 2].describe()
#Reset the modified pandas options back to their defaults.
#reset_option takes a single pattern per call: the original passed two
#option names positionally, which raises a TypeError. Only max_columns
#and float_format were changed above, so reset exactly those.
pd.reset_option('display.max_columns')
pd.reset_option('display.float_format')
#Summary table data: column 1 = cluster label, column 2 = hand-written HTML
#profile of the cluster, column 3 = neighbourhood count per cluster.
values = [['<br>Cluster 1', '<br>Cluster 2','<br>Cluster 3'],
[
"""
<b>House Price Range:</b> ~550K<br>
<b>Childcare Spaces:</b> Low to Medium<br>
<b>Crime Rate:</b> High. Common cases of Assaults, Break & Enter, and Hazardous Incidents.<br>
<b>Nearby Venues:</b> Less restaurants, photography studio and Farms, Farmers market<br>
<b>Local Employment:</b> Medium
""",
"""
<b>House Price Range:</b> 250K to 600K<br>
<b>Childcare Spaces:</b> Medium to High. Some neighbourhoods don't have childcare spaces<br>
<b>Crime Rate:</b> Medium to High. Common cases of Assaults, Break & Enter.<br>
<b>Nearby Venues:</b> Restaunrants, coffee shops, pubs and bars<br>
<b>Local Employment:</b> Medium to High
""",
"""
<b>House Price Range:</b> 250K to 800K<br>
<b>Childcare Spaces:</b> Medium to High (Each nighbourhood has a childcare space)<br>
<b>Crime Rate:</b> Low to High. Common cases of Assaults, Hazardous Incidents, and Thefts.<br>
<b>Nearby Venues:</b> Playgrounds or parks, restaurants, pubs/bars, gym and clothing store<br>
<b>Local Employment:</b> Medium to High
"""
],
#count of neighbourhoods that fell into each cluster
[df_Final[df_Final['Cluster Labels'] == 0].shape[0],
df_Final[df_Final['Cluster Labels'] == 1].shape[0],
df_Final[df_Final['Cluster Labels'] == 2].shape[0]
]
]
#Render the summary as a Plotly table: fixed column widths, styled header,
#alternating cell fill colours.
fig = go.Figure(data=[go.Table(
columnorder = [1,2,3],
columnwidth = [80,440,110],
header = dict(
values = [ ['<b>Cluster/<br>Segments</b>'],
['<b>Cluster/Segment<br> Details</b>'],
['<b>Number of Neighbourhoods</b>']],
line_color='darkslategray',
fill_color='royalblue',
align=['left','center'],
font=dict(color='white', size=14),
height=40
),
cells=dict(
values=values,
line_color='darkslategray',
fill=dict(color=['paleturquoise', 'white']),
align=['left','left','center'],
font_size=12,
height=30)
)
])
fig.show()